In [1]:
# Import required libraries and dependencies
import pandas as pd
import hvplot.pandas
from sklearn.cluster import KMeans
from sklearn.decomposition import PCA
from sklearn.preprocessing import StandardScaler
In [2]:
# Read the crypto market CSV into a DataFrame, using `coin_id` as the row index
df_market_data = pd.read_csv("Resources/crypto_market_data.csv", index_col="coin_id")

# Show the first 40 rows of the loaded data
df_market_data.head(40)
Out[2]:
price_change_percentage_24h price_change_percentage_7d price_change_percentage_14d price_change_percentage_30d price_change_percentage_60d price_change_percentage_200d price_change_percentage_1y
coin_id
bitcoin 1.08388 7.60278 6.57509 7.67258 -3.25185 83.51840 37.51761
ethereum 0.22392 10.38134 4.80849 0.13169 -12.88890 186.77418 101.96023
tether -0.21173 0.04935 0.00640 -0.04237 0.28037 -0.00542 0.01954
ripple -0.37819 -0.60926 2.24984 0.23455 -17.55245 39.53888 -16.60193
bitcoin-cash 2.90585 17.09717 14.75334 15.74903 -13.71793 21.66042 14.49384
binancecoin 2.10423 12.85511 6.80688 0.05865 36.33486 155.61937 69.69195
chainlink -0.23935 20.69459 9.30098 -11.21747 -43.69522 403.22917 325.13186
cardano 0.00322 13.99302 5.55476 10.10553 -22.84776 264.51418 156.09756
litecoin -0.06341 6.60221 7.28931 1.21662 -17.23960 27.49919 -12.66408
bitcoin-cash-sv 0.92530 3.29641 -1.86656 2.88926 -24.87434 7.42562 93.73082
crypto-com-chain 0.61209 -5.67151 -8.53948 -17.44782 -16.47600 226.70782 305.05908
usd-coin -0.17825 -0.11871 -0.00568 -0.16584 0.04271 -0.15691 -0.19205
eos 0.14477 -1.31177 1.13751 -4.63398 -30.16898 18.06111 -17.56753
monero 0.42961 15.78515 18.41097 38.95974 41.72500 169.52147 141.04116
tron 0.07647 4.23886 1.40337 -12.60389 5.52545 132.88436 59.23821
tezos -0.67316 8.95665 2.32062 -14.12663 -44.82248 43.42842 140.01279
okb -2.72700 -4.55389 -5.02662 -10.43847 -2.83120 39.95853 141.95791
stellar -1.00843 2.07149 -1.08217 -8.12933 -30.80369 84.62157 13.80715
cosmos -0.95103 16.08534 5.51074 4.57813 -7.20130 185.99786 82.43833
cdai 0.21169 0.05820 0.17076 -2.18147 0.65726 -0.39210 -0.28783
neo 0.49302 2.44243 -9.84803 -21.95472 13.51879 158.64773 131.29655
wrapped-bitcoin 1.10231 7.40537 6.55668 7.37557 -3.58772 83.90520 37.53424
leo-token -0.13192 -1.34886 -7.02859 3.07525 -7.54455 16.40588 21.00263
huobi-token -0.40818 1.61798 0.45488 -3.25488 -3.40689 42.23704 24.57164
nem -0.84990 -0.85140 2.64844 -3.65382 82.86094 216.17761 200.71797
binance-usd -0.10642 0.04726 0.05902 0.01843 0.09383 0.09959 0.13928
iota 0.41996 6.06830 -3.77714 -4.15281 -34.51894 95.02821 -3.98533
vechain 1.28766 -1.76352 -18.15890 -14.16831 -43.62359 269.70264 202.86827
zcash -0.60897 10.34780 5.79179 7.37007 -20.54216 122.54767 82.88499
theta-token -4.56089 -6.09456 -6.57354 31.43355 80.03112 882.65105 701.37599
dash -1.06006 5.09387 0.51708 -7.79140 -26.22460 6.23435 -2.45897
ethereum-classic -0.45950 3.05209 -1.26669 2.89572 -25.90799 10.39203 11.11094
ethlend -13.52786 4.21266 -9.80075 -29.99499 2.13917 2227.92782 7852.08970
maker -0.60285 8.15400 -1.48854 16.71360 -18.61722 82.53544 10.65279
havven -4.07216 4.33651 -13.29164 -20.84154 -39.42657 622.92465 678.78427
omisego 4.84033 6.82985 -13.21636 -9.11552 79.27505 585.26307 320.69054
celsius-degree-token 2.51323 0.60354 24.23919 140.79570 223.06437 1590.19149 2009.72217
ontology -1.35845 -1.21399 -10.56222 -34.70548 -32.70004 54.76717 -12.65493
ftx-token 0.83416 7.10438 -0.20629 -10.56394 5.90295 57.48950 168.37251
true-usd -0.06197 0.16642 0.10974 0.03090 0.25154 -0.08874 0.40617
In [3]:
# Generate summary statistics (count, mean, std, min, quartiles, max) for each column
df_market_data.describe()
Out[3]:
price_change_percentage_24h price_change_percentage_7d price_change_percentage_14d price_change_percentage_30d price_change_percentage_60d price_change_percentage_200d price_change_percentage_1y
count 41.000000 41.000000 41.000000 41.000000 41.000000 41.000000 41.000000
mean -0.269686 4.497147 0.185787 1.545693 -0.094119 236.537432 347.667956
std 2.694793 6.375218 8.376939 26.344218 47.365803 435.225304 1247.842884
min -13.527860 -6.094560 -18.158900 -34.705480 -44.822480 -0.392100 -17.567530
25% -0.608970 0.047260 -5.026620 -10.438470 -25.907990 21.660420 0.406170
50% -0.063410 3.296410 0.109740 -0.042370 -7.544550 83.905200 69.691950
75% 0.612090 7.602780 5.510740 4.578130 0.657260 216.177610 168.372510
max 4.840330 20.694590 24.239190 140.795700 223.064370 2227.927820 7852.089700
In [4]:
# Draw one line per price-change column across all coins;
# `rot=90` rotates the tick labels so the coin ids stay readable
df_market_data.hvplot.line(width=800, height=400, rot=90)
Out[4]:

Prepare the Data¶

In [5]:
# Columns holding the price-change percentages that are fed to the scaler.
# Defined once so the selection and the DataFrame header cannot drift apart.
price_change_columns = [
    'price_change_percentage_24h',
    'price_change_percentage_7d',
    'price_change_percentage_14d',
    'price_change_percentage_30d',
    'price_change_percentage_60d',
    'price_change_percentage_200d',
    'price_change_percentage_1y',
]

# Use the `StandardScaler()` module from scikit-learn to normalize the data from the CSV file.
# The scaler returns a NumPy array, so it is kept under its own name instead of
# binding `df_market_data_scaled` to two different kinds of object.
market_data_scaled = StandardScaler().fit_transform(df_market_data[price_change_columns])

# Create a DataFrame with the scaled data, restoring the column names
df_market_data_scaled = pd.DataFrame(market_data_scaled, columns=price_change_columns)
In [6]:
# Copy the crypto names (the `coin_id` index) from the original data
crypto_name_index = df_market_data.index

# Use the coin ids as the index of the scaled DataFrame
df_market_data_scaled.index = crypto_name_index

# Display the scaled data
df_market_data_scaled
Out[6]:
price_change_percentage_24h price_change_percentage_7d price_change_percentage_14d price_change_percentage_30d price_change_percentage_60d price_change_percentage_200d price_change_percentage_1y
coin_id
bitcoin 0.508529 0.493193 0.772200 0.235460 -0.067495 -0.355953 -0.251637
ethereum 0.185446 0.934445 0.558692 -0.054341 -0.273483 -0.115759 -0.199352
tether 0.021774 -0.706337 -0.021680 -0.061030 0.008005 -0.550247 -0.282061
ripple -0.040764 -0.810928 0.249458 -0.050388 -0.373164 -0.458259 -0.295546
bitcoin-cash 1.193036 2.000959 1.760610 0.545842 -0.291203 -0.499848 -0.270317
binancecoin 0.891871 1.327295 0.800214 -0.057148 0.778653 -0.188232 -0.225533
chainlink 0.011397 2.572251 1.101647 -0.490495 -0.931954 0.387759 -0.018284
cardano 0.102530 1.508001 0.648885 0.328959 -0.486349 0.065080 -0.155428
litecoin 0.077497 0.334297 0.858520 -0.012646 -0.366477 -0.486266 -0.292351
bitcoin-cash-sv 0.448952 -0.190684 -0.248043 0.051634 -0.529666 -0.532961 -0.206029
crypto-com-chain 0.331280 -1.614844 -1.054521 -0.729931 -0.350155 -0.022866 -0.034570
usd-coin 0.034352 -0.733026 -0.023140 -0.065775 0.002925 -0.550599 -0.282232
eos 0.155710 -0.922491 0.115024 -0.237488 -0.642837 -0.508220 -0.296330
monero 0.262723 1.792602 2.202665 1.437842 0.893865 -0.155893 -0.167644
tron 0.130050 -0.041018 0.147155 -0.543776 0.120116 -0.241118 -0.234014
tezos -0.151583 0.708196 0.258012 -0.602296 -0.956049 -0.449211 -0.168479
okb -0.923203 -1.437359 -0.629963 -0.460558 -0.058504 -0.457283 -0.166900
stellar -0.277543 -0.385209 -0.153243 -0.371816 -0.656403 -0.353387 -0.270874
cosmos -0.255978 1.840274 0.643565 0.116538 -0.151913 -0.117565 -0.215191
cdai 0.180851 -0.704931 -0.001816 -0.143237 0.016060 -0.551146 -0.282310
neo 0.286546 -0.326301 -1.212670 -0.903134 0.290970 -0.181187 -0.175550
wrapped-bitcoin 0.515453 0.461843 0.769975 0.224045 -0.074674 -0.355054 -0.251623
leo-token 0.051758 -0.928381 -0.871918 0.058782 -0.159250 -0.512071 -0.265036
huobi-token -0.052032 -0.457229 0.032522 -0.184489 -0.070809 -0.451982 -0.262140
nem -0.217984 -0.849381 0.297632 -0.199820 1.773127 -0.047361 -0.119226
binance-usd 0.061339 -0.706669 -0.015321 -0.058694 0.004017 -0.550003 -0.281963
iota 0.259097 0.249508 -0.478953 -0.218997 -0.735815 -0.329179 -0.285310
vechain 0.585089 -0.994231 -2.217108 -0.603898 -0.930423 0.077149 -0.117482
zcash -0.127467 0.929119 0.677532 0.223834 -0.437068 -0.265163 -0.214829
theta-token -1.612188 -1.682027 -0.816921 1.148607 1.712641 1.502992 0.286977
dash -0.296940 0.094763 0.040040 -0.358830 -0.558527 -0.535732 -0.284071
ethereum-classic -0.071312 -0.229484 -0.175544 0.051882 -0.551760 -0.526060 -0.273062
ethlend -4.981042 -0.045178 -1.206956 -1.212126 0.047736 4.632380 6.088625
maker -0.125168 0.580730 -0.202356 0.582911 -0.395923 -0.358240 -0.273433
havven -1.428574 -0.025510 -1.628859 -0.860354 -0.840714 0.898815 0.268647
omisego 1.919812 0.370447 -1.619761 -0.409716 1.696480 0.811207 -0.021888
celsius-degree-token 1.045530 -0.618328 2.907054 5.351455 4.769913 3.148875 1.348488
ontology -0.409044 -0.906963 -1.298986 -1.393153 -0.696937 -0.422835 -0.292344
ftx-token 0.414711 0.414044 -0.047386 -0.465380 0.128185 -0.416502 -0.145469
true-usd 0.078038 -0.687745 -0.009191 -0.058214 0.007388 -0.550441 -0.281747
digibyte 1.217453 -0.607714 -0.907066 0.449939 -0.662530 0.572367 -0.132482
In [7]:
# Inspect the coin-id index copied from the original data
crypto_name_index
Out[7]:
Index(['bitcoin', 'ethereum', 'tether', 'ripple', 'bitcoin-cash',
       'binancecoin', 'chainlink', 'cardano', 'litecoin', 'bitcoin-cash-sv',
       'crypto-com-chain', 'usd-coin', 'eos', 'monero', 'tron', 'tezos', 'okb',
       'stellar', 'cosmos', 'cdai', 'neo', 'wrapped-bitcoin', 'leo-token',
       'huobi-token', 'nem', 'binance-usd', 'iota', 'vechain', 'zcash',
       'theta-token', 'dash', 'ethereum-classic', 'ethlend', 'maker', 'havven',
       'omisego', 'celsius-degree-token', 'ontology', 'ftx-token', 'true-usd',
       'digibyte'],
      dtype='object', name='coin_id')

Find the Best Value for k Using the Original Data.¶

In [8]:
# Candidate numbers of clusters: 1 through 10 (the end of `range` is exclusive)
k = list(range(1, 11))
k
Out[8]:
[1, 2, 3, 4, 5, 6, 7, 8, 9, 10]
In [9]:
# For every candidate k: create a KMeans model with that many clusters,
# fit it to `df_market_data_scaled`, and record the fitted model's inertia.
# `fit` returns the fitted estimator, so `inertia_` is read straight off the call.
inertia = [
    KMeans(n_clusters=n_clusters, random_state=1).fit(df_market_data_scaled).inertia_
    for n_clusters in k
]
C:\Users\samit\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1036: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
  warnings.warn(
In [10]:
# Collect the k values and their inertias as the columns of the Elbow-curve data
elbow_data = dict(K=k, Inertia=inertia)

# Turn that mapping into a DataFrame and preview its first rows
df_elbow_scaled = pd.DataFrame(elbow_data)
df_elbow_scaled.head()
Out[10]:
K Inertia
0 1 287.000000
1 2 195.820218
2 3 123.190482
3 4 79.022435
4 5 65.405923
In [11]:
# Plot a line chart with all the inertia values computed with
# the different values of k to visually identify the optimal value for k.
# The plot is built once and kept in `elbow_plot_orginal` so the same object
# is displayed here and reused in the composite comparison.
elbow_plot_orginal = df_elbow_scaled.hvplot.line(
    x="K",
    y="Inertia",
    title="Elbow Curve",
    xticks=k)
elbow_plot_orginal
Out[11]:

Answer the following question:¶

Question: What is the best value for k?

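Answer: The best value for k is 4. This is the elbow of the curve: inertia drops sharply up to k = 4 (from about 123 at k = 3 to about 79 at k = 4), while each additional cluster beyond that reduces inertia only marginally (about 65 at k = 5).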


Cluster Cryptocurrencies with K-means Using the Original Data¶

In [12]:
# Build the K-Means model with k=4, the value chosen from the Elbow curve;
# `random_state=1` matches the seed used for the Elbow-curve models
model = KMeans(n_clusters=4, random_state=1)
In [13]:
# Fit the K-Means model to the scaled price-change data (all seven columns)
model.fit(df_market_data_scaled)
Out[13]:
KMeans(n_clusters=4, random_state=1)
In [14]:
# Predict the cluster label of each cryptocurrency from the scaled data
k_lower = model.predict(df_market_data_scaled)

# Show the resulting array of cluster labels (one entry per row of the scaled data)
k_lower
Out[14]:
array([3, 3, 1, 1, 3, 3, 3, 3, 3, 1, 1, 1, 1, 3, 1, 3, 1, 1, 3, 1, 1, 3,
       1, 1, 1, 1, 1, 1, 3, 1, 1, 1, 0, 3, 1, 1, 2, 1, 1, 1, 1])
In [15]:
# Create a copy of the scaled DataFrame so cluster labels can be added
# without modifying `df_market_data_scaled`
df_market_data_scaled_prediction = df_market_data_scaled.copy()
In [16]:
# Store the predicted cluster label of every coin in a new `cluster_lower` column
df_market_data_scaled_prediction["cluster_lower"] = k_lower

# Preview the first rows, now including the cluster labels
df_market_data_scaled_prediction.head()
Out[16]:
price_change_percentage_24h price_change_percentage_7d price_change_percentage_14d price_change_percentage_30d price_change_percentage_60d price_change_percentage_200d price_change_percentage_1y cluster_lower
coin_id
bitcoin 0.508529 0.493193 0.772200 0.235460 -0.067495 -0.355953 -0.251637 3
ethereum 0.185446 0.934445 0.558692 -0.054341 -0.273483 -0.115759 -0.199352 3
tether 0.021774 -0.706337 -0.021680 -0.061030 0.008005 -0.550247 -0.282061 1
ripple -0.040764 -0.810928 0.249458 -0.050388 -0.373164 -0.458259 -0.295546 1
bitcoin-cash 1.193036 2.000959 1.760610 0.545842 -0.291203 -0.499848 -0.270317 3
In [17]:
# Create a scatter plot using hvPlot by setting
# `x="price_change_percentage_24h"` and `y="price_change_percentage_7d"`.
# Color the graph points with the labels found using K-Means and
# add the crypto name in the `hover_cols` parameter to identify
# the cryptocurrency represented by each data point.
# The plot is built once and kept in `cluster_plot_original` so the same object
# is displayed here and reused in the composite comparison.
cluster_plot_original = df_market_data_scaled_prediction.hvplot.scatter(
    x="price_change_percentage_24h",
    y="price_change_percentage_7d",
    by="cluster_lower",
    hover_cols="coin_id"
).opts(yformatter="%.0f")

cluster_plot_original
Out[17]:

Optimize Clusters with Principal Component Analysis.¶

In [18]:
# Create a PCA model instance and set `n_components=3`
# so the data is reduced to three principal components.

pca = PCA(n_components=3)
In [19]:
# Use the PCA model with `fit_transform` to reduce the scaled data to
# three principal components.
market_data_pca = pca.fit_transform(df_market_data_scaled)
# View the first five rows of the resulting array (not yet a DataFrame).
market_data_pca[:5]
Out[19]:
array([[-0.60066733,  0.84276006,  0.46159457],
       [-0.45826071,  0.45846566,  0.95287678],
       [-0.43306981, -0.16812638, -0.64175193],
       [-0.47183495, -0.22266008, -0.47905316],
       [-1.15779997,  2.04120919,  1.85971527]])
In [20]:
# Retrieve the explained variance ratio to determine what fraction of the
# total variance can be attributed to each principal component.
pca.explained_variance_ratio_
Out[20]:
array([0.3719856 , 0.34700813, 0.17603793])

Answer the following question:¶

Question: What is the total explained variance of the three principal components?

Answer: About 89.5% of the total variance (0.372 + 0.347 + 0.176 ≈ 0.895) is captured by the three principal components.

In [21]:
# Wrap the PCA output in a DataFrame: one column per principal component,
# with the coin ids copied from the original data as the index
df_market_data_pca = pd.DataFrame(
    market_data_pca,
    index=crypto_name_index,
    columns=["PCA1", "PCA2", "PCA3"],
)

# Display the PCA data
df_market_data_pca
Out[21]:
PCA1 PCA2 PCA3
coin_id
bitcoin -0.600667 0.842760 0.461595
ethereum -0.458261 0.458466 0.952877
tether -0.433070 -0.168126 -0.641752
ripple -0.471835 -0.222660 -0.479053
bitcoin-cash -1.157800 2.041209 1.859715
binancecoin -0.516534 1.388377 0.804071
chainlink -0.450711 0.517699 2.846143
cardano -0.345600 0.729439 1.478013
litecoin -0.649468 0.432165 0.600303
bitcoin-cash-sv -0.759014 -0.201200 -0.217653
crypto-com-chain -0.248198 -1.376252 -1.462026
usd-coin -0.438408 -0.175337 -0.663388
eos -0.693425 -0.473815 -0.527597
monero 0.060499 2.909404 1.498571
tron -0.393352 -0.108192 -0.012756
tezos -0.796176 -0.494409 1.082812
okb 0.064075 -1.269825 -1.098829
stellar -0.489015 -0.732719 -0.062543
cosmos -0.306272 0.703415 1.714224
cdai -0.513528 -0.142802 -0.656566
neo -0.362120 -0.986914 -0.728752
wrapped-bitcoin -0.604265 0.827398 0.439316
leo-token -0.413296 -0.674115 -1.076628
huobi-token -0.407483 -0.212507 -0.351426
nem 0.608974 0.563532 -1.148742
binance-usd -0.450211 -0.151019 -0.647401
iota -0.764665 -0.517886 0.204990
vechain -0.556315 -1.938209 -1.261776
zcash -0.425147 0.492976 1.058048
theta-token 2.676868 -0.013954 -1.965207
dash -0.613923 -0.479337 0.339565
ethereum-classic -0.579924 -0.356334 -0.114942
ethlend 8.089018 -3.896891 2.301382
maker -0.389045 0.165041 0.379414
havven 0.865762 -2.261882 0.275583
omisego 0.111675 0.428316 -1.205398
celsius-degree-token 4.792395 6.767679 -1.986985
ontology -0.632355 -2.108117 -0.652227
ftx-token -0.593142 0.021485 0.209911
true-usd -0.458131 -0.135734 -0.635284
digibyte -0.297910 -0.191126 -0.909602

Find the Best Value for k Using the PCA Data¶

In [22]:
# Candidate numbers of clusters: 1 through 10 (the end of `range` is exclusive)
k = list(range(1, 11))
In [23]:
# For every candidate k: create a KMeans model with that many clusters,
# fit it to `df_market_data_pca`, and record the fitted model's inertia.
# `fit` returns the fitted estimator, so `inertia_` is read straight off the call.
inertia = [
    KMeans(n_clusters=n_clusters, random_state=1).fit(df_market_data_pca).inertia_
    for n_clusters in k
]
C:\Users\samit\anaconda3\lib\site-packages\sklearn\cluster\_kmeans.py:1036: UserWarning: KMeans is known to have a memory leak on Windows with MKL, when there are less chunks than available threads. You can avoid it by setting the environment variable OMP_NUM_THREADS=1.
  warnings.warn(
In [24]:
# Collect the k values and their inertias as the columns of the Elbow-curve data
elbow_data = dict(k=k, inertia=inertia)

# Turn that mapping into a DataFrame and preview its first rows
df_elbow = pd.DataFrame(elbow_data)
df_elbow.head()
Out[24]:
k inertia
0 1 256.874086
1 2 165.901994
2 3 93.774626
3 4 49.665497
4 5 38.352251
In [25]:
# Plot a line chart with all the inertia values computed with
# the different values of k to visually identify the optimal value for k.
# The plot is built once and kept in `elbow_pca` so the same object
# is displayed here and reused in the composite comparison.
elbow_pca = df_elbow.hvplot.line(
    x="k",
    y="inertia",
    title="Elbow Curve",
    xticks=k
)
elbow_pca
Out[25]:

Answer the following questions:¶

  • Question: What is the best value for k when using the PCA data?

    • Answer: the best value for K is 4 using the PCA data
  • Question: Does it differ from the best k value found using the original data?

    • Answer: no it doesn't differ from the best K found using the original data

Cluster Cryptocurrencies with K-means Using the PCA Data¶

In [26]:
# Initialize a fresh K-Means model using the best value for k (k=4).
# It must be bound to `model`, because the following cells fit and predict with `model`.
model = KMeans(n_clusters=4, random_state=1)
In [27]:
# Fit the K-Means model to the PCA data (columns PCA1, PCA2, PCA3)
model.fit(df_market_data_pca)
Out[27]:
KMeans(n_clusters=4, random_state=1)
In [28]:
# Predict the cluster label of each cryptocurrency from the PCA data
k_3 = model.predict(df_market_data_pca)
# Show the resulting array of cluster labels (one entry per row of the PCA data)
k_3
Out[28]:
array([1, 1, 0, 0, 1, 1, 1, 1, 1, 0, 0, 0, 0, 1, 0, 1, 0, 0, 1, 0, 0, 1,
       0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 2, 1, 0, 0, 3, 0, 0, 0, 0])
In [29]:
# Build a new DataFrame from the PCA data (leaving `df_market_data_pca` untouched)
# with the predicted cluster labels attached as a `market_segment` column
df_market_data_pca_predict = df_market_data_pca.assign(market_segment=k_3)

# Show the PCA data together with the cluster labels
df_market_data_pca_predict
Out[29]:
PCA1 PCA2 PCA3 market_segment
coin_id
bitcoin -0.600667 0.842760 0.461595 1
ethereum -0.458261 0.458466 0.952877 1
tether -0.433070 -0.168126 -0.641752 0
ripple -0.471835 -0.222660 -0.479053 0
bitcoin-cash -1.157800 2.041209 1.859715 1
binancecoin -0.516534 1.388377 0.804071 1
chainlink -0.450711 0.517699 2.846143 1
cardano -0.345600 0.729439 1.478013 1
litecoin -0.649468 0.432165 0.600303 1
bitcoin-cash-sv -0.759014 -0.201200 -0.217653 0
crypto-com-chain -0.248198 -1.376252 -1.462026 0
usd-coin -0.438408 -0.175337 -0.663388 0
eos -0.693425 -0.473815 -0.527597 0
monero 0.060499 2.909404 1.498571 1
tron -0.393352 -0.108192 -0.012756 0
tezos -0.796176 -0.494409 1.082812 1
okb 0.064075 -1.269825 -1.098829 0
stellar -0.489015 -0.732719 -0.062543 0
cosmos -0.306272 0.703415 1.714224 1
cdai -0.513528 -0.142802 -0.656566 0
neo -0.362120 -0.986914 -0.728752 0
wrapped-bitcoin -0.604265 0.827398 0.439316 1
leo-token -0.413296 -0.674115 -1.076628 0
huobi-token -0.407483 -0.212507 -0.351426 0
nem 0.608974 0.563532 -1.148742 0
binance-usd -0.450211 -0.151019 -0.647401 0
iota -0.764665 -0.517886 0.204990 0
vechain -0.556315 -1.938209 -1.261776 0
zcash -0.425147 0.492976 1.058048 1
theta-token 2.676868 -0.013954 -1.965207 0
dash -0.613923 -0.479337 0.339565 0
ethereum-classic -0.579924 -0.356334 -0.114942 0
ethlend 8.089018 -3.896891 2.301382 2
maker -0.389045 0.165041 0.379414 1
havven 0.865762 -2.261882 0.275583 0
omisego 0.111675 0.428316 -1.205398 0
celsius-degree-token 4.792395 6.767679 -1.986985 3
ontology -0.632355 -2.108117 -0.652227 0
ftx-token -0.593142 0.021485 0.209911 0
true-usd -0.458131 -0.135734 -0.635284 0
digibyte -0.297910 -0.191126 -0.909602 0
In [30]:
# Create a scatter plot using hvPlot by setting
# `x="PCA1"` and `y="PCA2"`.
# Color the graph points with the labels found using K-Means and
# add the crypto name in the `hover_cols` parameter to identify
# the cryptocurrency represented by each data point.
# The plot is built once and kept in `cluster_pca` so the same object
# is displayed here and reused in the composite comparison.
cluster_pca = df_market_data_pca_predict.hvplot.scatter(
    x="PCA1",
    y="PCA2",
    by="market_segment",
    hover_cols="coin_id"
)
cluster_pca
Out[30]:

Visualize and Compare the Results¶

In this section, you will visually analyze the cluster analysis results by contrasting the outcome with and without using the optimization techniques.

In [31]:
# Composite plot to contrast the Elbow curves
# (PCA data first, then the original scaled data)
composit_elbow = elbow_pca + elbow_plot_orginal
composit_elbow
Out[31]:
In [32]:
# Composite plot to contrast the clusters
# (PCA data first, then the original scaled data)
composit_cluster = cluster_pca + cluster_plot_original
composit_cluster
Out[32]:

Answer the following question:¶

  • Question: After visually analyzing the cluster analysis results, what is the impact of using fewer features to cluster the data using K-Means?

  • Answer: The elbow curves suggest the same number of clusters (k = 4) for both the original and the PCA data, although inertia is lower with the PCA data (about 49.7 vs. 79.0 at k = 4). Comparing the original and PCA cluster plots shows that the clusters are more clearly distinguishable when fewer features (the three principal components) are used.

In [ ]: